###################
###################
## Data gathering and preparation code for 
## Click, click boom: Using Wikipedia data to predict changes in battle-related deaths
## Christian Oswald, Daniel Ohrenhofer
###################
###################


## clear environment
## NOTE: this removes every visible object from the workspace the script is
## run in, so anything defined before sourcing this file is lost
rm(list = ls())

## set working directory
# setwd('/home/default/prediction-project/own_dat')

## load libraries
library(dplyr)
library(ggplot2)
library(magrittr)
library(lubridate)
library(stringr)
library(wikipediatrend) # get wikipedia page views
## make sure to install this version (1.2)
#devtools::install_github("fxjollois/WikipediaR")
library(WikipediaR) # get wikipedia links and contributions

## Date window for the data gathering.
## Use 2008-01-01 for a complete replication of the data gathering. The
## commented alternative (2017-01-01) was used because the 2008-2016 data come
## from the petermeissner website, which suffered from timeouts; later data
## come from the wikipedia website.
end_date <- "2020-08-31"
start_date <- "2008-01-01"
# start_date <- "2017-01-01"


## Country pages to be considered (English Wikipedia page names).
## List taken from
## https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa
## "Swaziland" is deliberately not a separate entry; it is handled together
## with "Eswatini" further below.
countries <- c(
  "Algeria", "Angola", "Benin", "Botswana", "Burkina_Faso", "Burundi",
  "Cameroon", "Cape_Verde", "Central_African_Republic", "Chad", "Comoros",
  "Democratic_Republic_of_the_Congo", "Republic_of_the_Congo", "Ivory_Coast",
  "Djibouti", "Egypt", "Equatorial_Guinea", "Eritrea", "Eswatini",
  "Ethiopia", "Gabon", "The_Gambia", "Ghana", "Guinea", "Guinea-Bissau",
  "Kenya", "Lesotho", "Liberia", "Libya", "Madagascar", "Malawi", "Mali",
  "Mauritania", "Mauritius", "Morocco", "Mozambique", "Namibia", "Niger",
  "Nigeria", "Rwanda", "São_Tomé_and_Príncipe", "Senegal", "Seychelles",
  "Sierra_Leone", "Somalia", "South_Africa", "South_Sudan", "Sudan",
  "Tanzania", "Togo", "Tunisia", "Uganda", "Zambia", "Zimbabwe"
)


###################
###################
## get page views of country pages
###################
###################


## Download English page views for every country page and aggregate them to
## one row per article, month and year (columns: article, month, year, views).
page_views <- NULL
page_views_monthly <- vector("list", length(countries))

for (k in seq_along(countries)) {
  i <- countries[k]
  ## Swaziland was renamed Eswatini and redirection does not work here, so
  ## both titles are requested and their views are added up under "Eswatini"
  is_eswatini <- i == "Eswatini"
  pages <- if (is_eswatini) c("Swaziland", "Eswatini") else i

  page_views <- wp_trend(page = pages,
                         from = start_date,
                         to = end_date,
                         lang = "en",
                         warn = TRUE) %>%
    ## numeric month/year for EVERY country: mixing lubridate numbers with
    ## format() strings made rbind() coerce them into a mix of "01" and "1"
    mutate(month = month(date), year = year(date)) %>%
    select(-date)

  if (is_eswatini) {
    page_views <- page_views %>%
      group_by(month, year) %>%
      summarise(views = sum(views),
                article = "Eswatini") %>%
      ungroup()
  } else {
    page_views <- page_views %>%
      group_by(article, month, year) %>%
      summarise(views = sum(views)) %>%
      ungroup()
  }
  print(i)
  page_views_monthly[[k]] <- page_views
}

## bind once instead of growing the result inside the loop;
## rbind() matches the columns of the Eswatini rows by name
page_views_all <- do.call(rbind, page_views_monthly)


## save all raw page views
# write.csv(page_views_all, file = 'data/page_views_all_2008_present.csv')






###################
###################
## retrieve contributions
## via WikipediaR package
###################
###################


## contributions (revisions) for the country pages themselves:
## one row per revision with user, timestamp, size and the country page name
contribs_country <- NULL
contribs_country_list <- vector("list", length(countries))
for (k in seq_along(countries)) {
  i <- countries[k]
  contribs_country <- contribs(i, rvprop = 'user|timestamp|size')$contribs
  print(i)
  ## same guard as for the linked pages: a page without revision data would
  ## otherwise add a junk row when the page name is attached
  if (is.null(contribs_country)) {
    warning("no contributions returned for ", i, call. = FALSE)
    next
  }
  ## name the column 'article' right away (needed for merging later)
  contribs_country_list[[k]] <- cbind(contribs_country, article = i)
}
## bind once instead of growing the data frame inside the loop
contribs_country_all <- do.call(rbind,
                                Filter(Negate(is.null), contribs_country_list))

## add the calendar date of each revision plus month/year as strings
## (month is zero-padded, e.g. "01")
contribs_country_all <- contribs_country_all %>%
  mutate(date = as_date(timestamp)) %>%
  mutate(month = format(date, "%m"), year = format(date, "%Y"))

## Aggregate the country-page revisions to one row per article, month and
## year:
##   revisions - number of revisions in that month
##   nsize     - sum of the revision sizes
##   nuser     - number of distinct users revising
## (the column order article, month, year, revisions, nsize, nuser is relied
## on when the saved file is read back in and renamed positionally)
contribs_country_all$size <- as.numeric(paste(contribs_country_all$size))
contribs_country_all <- contribs_country_all %>%
  group_by(article, month, year) %>%
  ## distinct users are counted in the same pass instead of building a
  ## separate unique() table and merging it back
  summarise(revisions = n(),
            nsize = sum(size),
            nuser = n_distinct(user)) %>%
  ungroup() %>%
  ## get data from 2008 onwards to match with page views; year is a string
  ## here, so compare it as a number explicitly
  filter(as.integer(year) >= 2008) %>%
  as.data.frame()


# write.csv(contribs_country_all, file = 'data/contribs_country_page_all_2008_present.csv')



## for pages linked to country page


## Country names with more than one word need a shorter search label, because
## linked page titles are matched against the label (see below)
countries2 <- c("Burkina_Faso", "Cape_Verde", "Central_African_Republic", "Democratic_Republic_of_the_Congo", "Republic_of_the_Congo", "Ivory_Coast", "Equatorial_Guinea", "The_Gambia", "Guinea-Bissau", "São_Tomé_and_Príncipe", "Sierra_Leone", "South_Africa", "South_Sudan")
## NOTE: from here on `countries` only holds the single-word country names
countries <- setdiff(countries, countries2)

## default label: page name with spaces; overrides are keyed by page name so
## they do not depend on the position inside countries2
countries2_label <- str_replace_all(countries2, "_", " ")
label_overrides <- c("Burkina_Faso" = "Burkina",
                     "Democratic_Republic_of_the_Congo" = "Congo",
                     "Republic_of_the_Congo" = "Congo",
                     "Ivory_Coast" = "Ivor",
                     "The_Gambia" = "Gambia",
                     "São_Tomé_and_Príncipe" = "São Tomé",
                     "South_Sudan" = "Sudan")
countries2_label[match(names(label_overrides), countries2)] <- unname(label_overrides)

## Monthly revision statistics for the main-namespace pages that are linked
## from a country page and whose title matches `label`.
##   page  - country page name as used for the API call (may contain "_")
##   label - regular expression the linked titles must match
##           (NOTE(review): this is a substring match, so e.g. the label
##            "Niger" also matches titles containing "Nigeria")
## Returns one row per month/year with columns month, year, revisions, nuser,
## nsize, narticle, article (in this order, which the later positional
## renaming relies on), or NULL if no linked page returned any revisions.
link_contribs_monthly <- function(page, label) {
  print(page)
  page_title <- str_replace_all(page, "_", " ")
  linked <- links(page)$links %>%
    filter(grepl(label, as.character(title)),
           ## only main articles
           nssubj == 'Main Article',
           ## delete country page; compare against the underscore AND the
           ## space form of the name, the latter was never checked before
           !(title %in% c(page, page_title)))

  revision_list <- list()
  for (linked_title in as.character(linked$title)) {
    print(linked_title)
    page_contribs <- contribs(linked_title, rvprop = 'user|timestamp|size')$contribs
    ## avoid having links which were not revised to enable adding linked page name
    if (!is.null(page_contribs)) {
      revision_list[[linked_title]] <- cbind(page_contribs, linked_page = linked_title)
    }
  }
  ## nothing to aggregate: skip this country instead of failing on NULL
  if (length(revision_list) == 0) {
    warning("no revised linked pages found for ", page, call. = FALSE)
    return(NULL)
  }

  all_revisions <- do.call(rbind, revision_list)
  all_revisions <- all_revisions[!is.na(all_revisions$timestamp), ]
  all_revisions$size <- as.numeric(paste(all_revisions$size))
  all_revisions %>%
    mutate(date = as_date(timestamp)) %>%
    mutate(month = format(date, "%m"), year = format(date, "%Y")) %>%
    group_by(month, year) %>%
    ## number of monthly revisions, number of users revising, size of
    ## revisions and number of distinct linked pages revised
    summarise(revisions = n(),
              nuser = length(unique(user)),
              nsize = sum(size),
              narticle = length(unique(linked_page))) %>%
    ungroup() %>%
    ## get data from 2008 onwards to match with page views
    filter(as.integer(year) >= 2008) %>%
    mutate(article = page)
}

## single-word countries use their own name as label, the others the labels
## defined above; results are bound once at the end
link_results <- Map(link_contribs_monthly,
                    c(countries, countries2),
                    c(countries, countries2_label))
contribs_links_all <- do.call(rbind,
                              unname(Filter(Negate(is.null), link_results)))

# write.csv(contribs_links_all, file = 'data/contribs_country_links_all_2008_present.csv')





## merge data, add missing country months



## 54 countries, 12*12 + 8 = 152 months per country (2008-01 to 2020-08)
## 54 * 152 = 8,208 country-month observations


## Read the three intermediate files back in and give each the same merge
## keys: a lower-case `article` and a `yearmonth` string "<year>-<month>".

## page views of the country pages
page_views_all <- read.csv("data/page_views_all_2008_present.csv") %>%
  select(-X) %>%
  mutate(yearmonth = str_c(year, month, sep = "-"),
         article = tolower(article))

## contributions to the country pages; columns are renamed by position and
## get a cc-prefix (country contributions)
contribs_country_all <- read.csv("data/contribs_country_page_all_2008_present.csv") %>%
  select(-X) %>%
  setNames(c("article", "month", "year", "ccrevisions", "ccnsize", "ccnuser")) %>%
  mutate(article = tolower(article),
         yearmonth = str_c(year, month, sep = "-"))

## contributions to the pages linked from the country pages; columns are
## renamed by position and get a cl-prefix (contributions to linked pages)
contribs_links_all <- read.csv('data/contribs_country_links_all_2008_present.csv') %>%
  select(-X) %>%
  setNames(c("month", "year", "clrevisions", "cluser", "clnsize", "clnarticle", "article")) %>%
  mutate(article = tolower(article),
         yearmonth = str_c(year, month, sep = "-"))

## in case there was a mix up with end dates
# page_views_all <- page_views_all %>%
#   filter(yearmonth <= "2020-8")
# contribs_country_all <- contribs_country_all %>%
#   filter(yearmonth <= "2020-8")
# contribs_links_all <- contribs_links_all %>%
#   filter(yearmonth <= "2020-8")


## search where country-months are missing

## creates the output with countries below
## for which observations with 0s have to be added

# test <- contribs_country_all %>%
#   group_by(article) %>%
#   mutate(obs = length(yearmonth)) %>%
#   ungroup() %>%
#   filter(obs != 152)
# unique(test$article)
# table(test$article, test$obs)
# test <- test[order(test$article, test$yearmonth),]


# benin                      0   0 150   0
# burundi                    0   0   0 151
# central_african_republic   0   0   0 151
# chad                       0   0   0 151
# comoros                    0   0 150   0
# eswatini                   0   0   0 151
# gabon                      0 149   0   0
# guinea                     0   0   0 151
# guinea-bissau              0   0   0 151
# lesotho                    0   0   0 151
# malawi                     0   0   0 151
# mali                       0   0 150   0
# mauritania                 0   0   0 151
# mozambique                 0   0   0 151
# niger                    145   0   0   0
# são_tomé_and_príncipe      0   0 150   0
# seychelles                 0   0   0 151
# togo                       0   0   0 151

## Country-months without any revision of the country page (see the table
## above): add them with zero revisions, zero size and zero users.
## One line per country; article, month and year are aligned by position.
tomerge <- data.frame(
  article = c(rep("benin", 2),
              "burundi",
              "central_african_republic",
              "chad",
              rep("comoros", 2),
              "eswatini",
              rep("gabon", 3),
              "guinea",
              "guinea-bissau",
              "lesotho",
              "malawi",
              rep("mali", 2),
              "mauritania",
              "mozambique",
              rep("niger", 7),
              rep("são_tomé_and_príncipe", 2),
              "seychelles",
              "togo"),
  month = c(6, 9,
            2,
            9,
            4,
            5, 2,
            6,
            2, 9, 3,
            4,
            10,
            6,
            4,
            11, 8,
            1,
            11,
            6, 9, 11, 1, 5, 7, 2,
            10, 8,
            7,
            4),
  year = c(2013, 2016,
           2017,
           2013,
           2020,
           2018, 2020,
           2016,
           2014, 2014, 2020,
           2015,
           2013,
           2016,
           2015,
           2013, 2017,
           2014,
           2015,
           2013, 2013, 2013, 2014, 2014, 2015, 2020,
           2013, 2017,
           2016,
           2015)
)
tomerge$ccrevisions <- 0
tomerge$ccnsize <- 0
tomerge$ccnuser <- 0
## same "<year>-<month>" key (month not zero-padded) as in the other tables
tomerge$yearmonth <- paste(tomerge$year, tomerge$month, sep = "-")

contribs_country_all <- rbind(contribs_country_all, tomerge)
rm(tomerge)


## full outer merge of country-page contributions and page views
test <- merge(contribs_country_all, page_views_all, by = c("article", "year", "month", "yearmonth"), all = TRUE)

## The linked-page file carries a garbled spelling of São Tomé and Príncipe
## (presumably UTF-8 bytes decoded as Latin-1 and then lower-cased). It is
## written with escapes because it contains an invisible soft hyphen (U+00AD);
## only the article column is repaired.
sao_tome_garbled <- "s\u00e3\u00a3o_tom\u00e3\u00a9_and_pr\u00e3\u00adncipe"
contribs_links_all$article[contribs_links_all$article %in% sao_tome_garbled] <- "são_tomé_and_príncipe"

## full outer merge with the linked-page contributions
test2 <- merge(test, contribs_links_all, by = c("article", "year", "month", "yearmonth"), all = TRUE)

## save the final version of the data
# write.csv(test2, file = 'data/full_data_2008_present.csv')


## add variables to make our data compatible with views data

## create monthID and countryID

library(stringr)
## Attach country_id and month_id to the merged data.
dat <- read.csv("/home/default/Dokumente/TRINITY/HT2020/comp/prediction-project/data/full_data_2008_present.csv")
cntryID <- read.csv("/home/default/Dokumente/TRINITY/HT2020/comp/prediction-project/countryID.csv")
colnames(cntryID) <- c("country_id", "article")

## turn the page names into the country names used in cntryID
dat$article <- gsub("_", " ", dat$article)
dat$article <- str_to_title(dat$article)

dat$article <- gsub("Democratic Republic Of The Congo", "Congo, DRC", dat$article)
dat$article <- gsub("Ivory Coast", "Cote d'Ivoire", dat$article)
## São Tomé and Príncipe: match on the ASCII skeleton of the name (bytewise,
## case-insensitive) so that correctly decoded as well as garbled accented
## letters are recognised, instead of one platform-specific escaped rendering
is_sao_tome <- grepl("^S[^ ]{1,12}o Tom[^ ]{1,12} And Pr[^ ]{1,12}ncipe$",
                     dat$article, ignore.case = TRUE, useBytes = TRUE)
dat$article[is_sao_tome] <- "Sao Tome and Principe"
## must come after the DRC replacement above
dat$article <- gsub("Republic Of The Congo", "Congo", dat$article)
dat$article <- gsub("Eswatini", "Swaziland", dat$article)

## the merge below is an inner join: report countries it would drop
no_country_id <- setdiff(unique(dat$article), cntryID$article)
if (length(no_country_id) > 0) {
  warning("no country_id found for: ", paste(no_country_id, collapse = ", "),
          call. = FALSE)
}
dat <- merge(dat, cntryID, by = c("article"))

## month_id runs from 337 (January 2008) upwards, 12 per year; computed
## directly from year and month so it does not depend on every year being
## present. Rows without a valid year/month are dropped.
dat2 <- dat[!is.na(dat$year) & dat$month %in% 1:12, ]
dat2$month_id <- 336 + (dat2$year - 2008) * 12 + dat2$month

## chronological row order with `month` as first column (later code drops
## columns of the saved file by position)
dat2 <- dat2[order(dat2$year, dat2$month),
             c("month", setdiff(names(dat2), "month"))]
rownames(dat2) <- NULL

colnames(dat2)[colnames(dat2) == "article"] <- "country_name"

## expected: 54 countries * 152 months = 8,208 rows; warn instead of
## truncating to a fixed number of rows
expected_rows <- length(unique(dat2$country_name)) * length(unique(dat2$month_id))
if (nrow(dat2) != expected_rows) {
  warning("dat2 has ", nrow(dat2), " rows, expected ", expected_rows,
          " (countries x months): some country-months are missing or duplicated",
          call. = FALSE)
}

#write.csv(dat2, file = '/home/default/Dokumente/TRINITY/HT2020/comp/prediction-project/data/full_data_NEW_2008_present.csv')



### Include transformations
#write.csv(dat2, file = '/home/default/Dokumente/TRINITY/HT2020/comp/prediction-project/data/full_data_NEW_2008_present.csv')
dat2 <- read.csv(file = '/home/default/Dokumente/TRINITY/HT2020/comp/prediction-project/data/full_data_NEW_2008_present.csv')
## drop columns 1 and 4 by position
## NOTE(review): presumably the two row-number columns added by write.csv --
## confirm against the saved file
dat2 <- dat2[, -c(1, 4)]

## Ratios below are plain divisions: a zero denominator gives NaN or Inf.

## country page: revisions per view
dat2$ccrev_per_views <- dat2$ccrevisions / dat2$views

## country page: size per revision
dat2$ccsize_per_rev <- dat2$ccnsize / dat2$ccrevisions

## country page: size of revisions per user
dat2$ccsize_per_user <- dat2$ccnsize / dat2$ccnuser

## country page: average revisions per user
dat2$ccrev_per_user <- dat2$ccrevisions / dat2$ccnuser

## linked pages: revisions per article
dat2$clrev_per_article <- dat2$clrevisions / dat2$clnarticle

## linked pages: size of revisions per user
dat2$clsize_per_user <- dat2$clnsize / dat2$cluser

## linked pages: average revisions per user
dat2$clrev_per_user <- dat2$clrevisions / dat2$cluser

## linked pages: average users per article
## (was clnarticle / cluser, i.e. articles per user -- the inverse)
dat2$cluser_per_article <- dat2$cluser / dat2$clnarticle


write.csv(dat2, file = '/home/default/Dokumente/TRINITY/HT2020/comp/prediction-project/data/data_final_2008_present.csv')





## page views for french language


## read the file written by the transformation step; the path now points to
## the same directory that step writes to (it lacked the "HT2020" component)
dat3 <- read.csv(file = '/home/default/Dokumente/TRINITY/HT2020/comp/prediction-project/data/data_final_2008_present.csv')

dat3_countries <- unique(dat3$country_name)
start_date <- "2008-01-01"
end_date <- "2020-08-31"


## full list of country pages again (`countries` was reduced to the
## single-word names when the linked pages were collected)
countries <- c(
  "Algeria", "Angola", "Benin", "Botswana", "Burkina_Faso", "Burundi",
  "Cameroon", "Cape_Verde", "Central_African_Republic", "Chad", "Comoros",
  "Democratic_Republic_of_the_Congo", "Republic_of_the_Congo", "Ivory_Coast",
  "Djibouti", "Egypt", "Equatorial_Guinea", "Eritrea", "Eswatini",
  "Ethiopia", "Gabon", "The_Gambia", "Ghana", "Guinea", "Guinea-Bissau",
  "Kenya", "Lesotho", "Liberia", "Libya", "Madagascar", "Malawi", "Mali",
  "Mauritania", "Mauritius", "Morocco", "Mozambique", "Namibia", "Niger",
  "Nigeria", "Rwanda", "São_Tomé_and_Príncipe", "Senegal", "Seychelles",
  "Sierra_Leone", "Somalia", "South_Africa", "South_Sudan", "Sudan",
  # "Swaziland",
  "Tanzania", "Togo", "Tunisia", "Uganda", "Zambia", "Zimbabwe"
)


## French page title for every English country page, taken from the
## language links of the English page (column 2 = language, column 3 = title).
## countriesEN and countriesFR are parallel vectors; stop if a page does not
## have exactly one French counterpart, because the two vectors would
## otherwise get out of step.
countriesEN <- countries
countriesFR <- vapply(countries, function(page) {
  linked <- wp_linked_pages(page, "en")
  fr_title <- as.character(linked[[3]][which(linked[[2]] == "fr")])
  if (length(fr_title) != 1) {
    stop("expected exactly one French page for ", page, ", found ",
         length(fr_title), call. = FALSE)
  }
  fr_title
}, character(1), USE.NAMES = FALSE)



## Download daily French page views for every country page.
page_views <- NULL
page_views_french <- vector("list", length(countriesFR))
for (k in seq_along(countriesFR)) {
  i <- countriesFR[k]

  ## if clause to deal with the fact that swaziland was renamed eswatini and
  ## redirection does not work here: request both titles and add up the
  ## views per day
  if (i == "Eswatini") {
    page_views <- wp_trend(page = c("Swaziland", "Eswatini"),
                           from = start_date,
                           to = end_date,
                           lang = "fr",
                           warn = TRUE)
    page_views <- page_views %>%
      group_by(date) %>%
      summarise(views = sum(views),
                language = "fr",
                article = "eswatini") %>%
      ungroup()
  } else {
    ## for every other country except eswatini/swaziland
    page_views <- wp_trend(page = i,
                           from = start_date,
                           to = end_date,
                           lang = "fr",
                           warn = TRUE)
  }
  print(i)
  page_views_french[[k]] <- page_views
}
## bind once instead of growing the result inside the loop
page_views_all <- do.call(rbind, page_views_french)

## Sum the daily French views to one row per year-month ("YYYY-MM") and
## article.
page_views_all_FR <- page_views_all %>%
  mutate(yearmonth = format(date, "%Y-%m")) %>%
  group_by(yearmonth, article) %>%
  summarize(viewsFrench = sum(views))

## Lookup from the (lower-cased) French title to the English page name
test <- as.data.frame(cbind(countriesFR, countriesEN))
test$countriesFR <- tolower(test$countriesFR)
page_views_all_FR <- merge(page_views_all_FR, test,
                           by.x = "article", by.y = "countriesFR")

## keep the English name as `article` in place of the French one
page_views_all_FR <- page_views_all_FR[, c("yearmonth", "viewsFrench", "countriesEN")]
names(page_views_all_FR)[names(page_views_all_FR) == "countriesEN"] <- "article"
page_views_all_FR

## sort by article and, within article, by year-month
page_views_all_FR <- page_views_all_FR[order(page_views_all_FR$article,
                                             page_views_all_FR$yearmonth), ]

## ---------------------------------------------------------------------------
## Month-on-month changes in page views, language flags and final assembly.
## The French table is joined to dat3 by country and month rather than by row
## position, because the two tables sort countries under different names
## (e.g. "Democratic_Republic_of_the_Congo" vs "Congo, DRC").
## ---------------------------------------------------------------------------

## first difference within each group, 0 for the first observation of a
## group; `values` must be in chronological order within `groups`
diff_within_group <- function(values, groups) {
  ave(as.numeric(values), groups, FUN = function(v) c(0, diff(v)))
}

## same name normalisation that was applied when the country ids were
## attached: English page name -> country name used in dat3
article_to_country_name <- function(article) {
  label <- str_to_title(gsub("_", " ", as.character(article)))
  label <- gsub("Democratic Republic Of The Congo", "Congo, DRC", label)
  label <- gsub("Ivory Coast", "Cote d'Ivoire", label)
  is_sao_tome <- grepl("^S[^ ]{1,12}o Tom[^ ]{1,12} And Pr[^ ]{1,12}ncipe$",
                       label, ignore.case = TRUE, useBytes = TRUE)
  label[is_sao_tome] <- "Sao Tome and Principe"
  label <- gsub("Republic Of The Congo", "Congo", label)
  label <- gsub("Eswatini", "Swaziland", label)
  label
}

## French views: change to the previous month, per article
page_views_all_FR <- page_views_all_FR[order(page_views_all_FR$article,
                                             page_views_all_FR$yearmonth), ]
diffFR <- diff_within_group(page_views_all_FR$viewsFrench,
                            as.character(page_views_all_FR$article))
page_views_all_FR$diffFR <- diffFR

## English views: change to the previous month, per country; dat3 has to be
## sorted by country (and time) BEFORE the differences are taken
colnames(dat3)[colnames(dat3) == "views"] <- "viewsEnglish"
dat3$country_name <- as.character(dat3$country_name)
dat3 <- dat3[order(dat3$country_name, dat3$year, dat3$month), ]
viewsEnglish <- dat3$viewsEnglish
diffEN <- diff_within_group(dat3$viewsEnglish, dat3$country_name)
dat3$diffEN <- diffEN


## country lists are written with spaces, the article names with
## underscores: compare on the space form
article_label <- gsub("_", " ", as.character(page_views_all_FR$article))

## english-speaking countries
ENspeaking <- c("Nigeria", "Ethiopia", "South Africa", "Tanzania", "Kenya", "Sudan", "Uganda", "Ghana",
                "Cameroon", "Malawi", "Zambia", "Zimbabwe", "South Sudan", "Rwanda", "Burundi", "Sierra Leone",
                "Liberia", "Namibia", "Lesotho", "Botswana", "The Gambia", "Mauritius", "Eswatini",
                "Seychelles")
page_views_all_FR$ENspeaking <- as.numeric(article_label %in% ENspeaking)

## french-speaking countries
FRspeaking <- c("Democratic Republic of the Congo", "Madagascar", "Cameroon", "Ivory Coast", "Niger",
                "Burkina Faso", "Mali", "Senegal", "Chad", "Guinea", "Rwanda", "Burundi", "Benin", "Togo",
                "Central African Republic", "Republic of the Congo", "Gabon", "Djibouti", "Equatorial Guinea",
                "Comoros", "Seychelles")
page_views_all_FR$FRspeaking <- as.numeric(article_label %in% FRspeaking)

## arab spring
ArabSpring <- c("Tunisia", "Egypt", "Libya")
page_views_all_FR$ArabSpring <- as.numeric(article_label %in% ArabSpring)


## French table keyed like dat3: country_name, year, month
fr_yearmonth <- as.character(page_views_all_FR$yearmonth)
fr_side <- data.frame(
  country_name = article_to_country_name(page_views_all_FR$article),
  year = as.integer(substr(fr_yearmonth, 1, 4)),
  month = as.integer(substr(fr_yearmonth, 6, 7)),
  viewsFrench = page_views_all_FR$viewsFrench,
  diffFR = page_views_all_FR$diffFR,
  ENspeaking = page_views_all_FR$ENspeaking,
  FRspeaking = page_views_all_FR$FRspeaking,
  ArabSpring = page_views_all_FR$ArabSpring,
  stringsAsFactors = FALSE
)
key_columns <- c("country_name", "year", "month")
if (anyDuplicated(fr_side[, key_columns]) > 0) {
  stop("French page views contain duplicated country-months", call. = FALSE)
}
not_in_dat3 <- setdiff(unique(fr_side$country_name), unique(dat3$country_name))
if (length(not_in_dat3) > 0) {
  warning("French page views could not be matched to dat3 for: ",
          paste(not_in_dat3, collapse = ", "), call. = FALSE)
}

## keep every row of dat3; unmatched country-months get NA French columns
dat3 <- merge(dat3, fr_side, by = key_columns, all.x = TRUE, sort = FALSE)
dat3 <- dat3[order(dat3$country_name, dat3$year, dat3$month), ]
rownames(dat3) <- NULL

## drop the row-number column of the csv and put the id/view columns first
Wiki_Data_Final <- dat3[, setdiff(names(dat3), "X"), drop = FALSE]

Wiki_Data_Final <- Wiki_Data_Final %>% select(month, month_id, year, yearmonth, country_name, country_id,
                                              viewsEnglish, diffEN, viewsFrench, diffFR, everything())


#write.csv(Wiki_Data_Final, file = 'data/Wiki_Data_Final.csv')



## summary statistics

## merged data before country/month ids and transformations were added
full_data <- read.csv("data/full_data_2008_present.csv")
full_data <- select(full_data, -X)

## final data: every missing value (NA, including NaN) is replaced by 0
data_final <- read.csv("data/data_final_2008_present.csv")
data_final <- select(data_final, -X)
data_final <- mutate_all(data_final, ~replace(., is.na(.), 0))
## number of missing values left after the replacement
sum(is.na(data_final))



## descriptive statistics for everything except the identifier columns
descp <- select(data_final,
                -month, -country_name, -year, -yearmonth, -country_id, -month_id)
summary(descp)

## LaTeX summary table without the 25th/75th percentiles
stargazer::stargazer(descp,
                     digits = 3,
                     omit.summary.stat = c("p25", "p75"))

